library(rstudioapi)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loaded gam 1.20.2
library(splines)
library(splines2)
library(dplyr)
library(tidyr)
library(broom)
library(dslabs)
library(ggplot2)
library(ggthemes)
library(ggrepel)
# data cleaning for socio economic factors
soecon <- read_csv('soecon.csv')
## New names:
## Rows: 459 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): Location dbl (10): ...1, year, SNAP, individual_homeless, fam_homeless,
## total_homeles...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
fulldat <- read_csv('FullData.csv')
## New names:
## Rows: 468 Columns: 42
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): LocationAbbr, LocationDesc, State.Name dbl (39): ...1, YearStart,
## DataValue, LowConfidenceLimit, HighConfidenceLimi...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(fulldat)[names(fulldat) == 'DataValue'] <- 'Asthmap'
names(fulldat)[names(fulldat) == 'TOBDataValue'] <- 'TOB'
names(soecon)
## [1] "...1" "Location" "year"
## [4] "SNAP" "individual_homeless" "fam_homeless"
## [7] "total_homeless" "unemployment_rate" "Male_poverty"
## [10] "Female_poverty" "Total_poverty"
Run a summary of the variables in socioeconomic category
summary(soecon)
## ...1 Location year SNAP
## Min. : 1.0 Length:459 Min. :2011 Min. : 2.431
## 1st Qu.:115.5 Class :character 1st Qu.:2013 1st Qu.: 18.913
## Median :230.0 Mode :character Median :2015 Median : 60.619
## Mean :230.0 Mean :2015 Mean : 85.846
## 3rd Qu.:344.5 3rd Qu.:2017 3rd Qu.: 93.756
## Max. :459.0 Max. :2019 Max. :441.777
## NA's :1
## individual_homeless fam_homeless total_homeless unemployment_rate
## Min. : 0.0320 Min. :0.00750 Min. : 0.0542 Min. : 2.100
## 1st Qu.: 0.1500 1st Qu.:0.07225 1st Qu.: 0.2377 1st Qu.: 3.900
## Median : 0.3505 Median :0.16880 Median : 0.5785 Median : 5.000
## Mean : 0.7216 Mean :0.39994 Mean : 1.1215 Mean : 5.471
## 3rd Qu.: 0.6794 3rd Qu.:0.39065 3rd Qu.: 1.0424 3rd Qu.: 6.700
## Max. :12.8777 Max. :5.21150 Max. :15.1278 Max. :15.800
##
## Male_poverty Female_poverty Total_poverty
## Min. :0.0570 Min. :0.0780 Min. :0.0700
## 1st Qu.:0.0980 1st Qu.:0.1250 1st Qu.:0.1120
## Median :0.1140 Median :0.1460 Median :0.1290
## Mean :0.1167 Mean :0.1508 Mean :0.1341
## 3rd Qu.:0.1360 3rd Qu.:0.1740 3rd Qu.:0.1545
## Max. :0.2020 Max. :0.2610 Max. :0.2230
##
Draw histograms of the variables in the dataset
par(mfrow = c(2,2))
hist(soecon$SNAP, main='SNAP enrollment across states', xlab = 'per 10,000 people' )
hist(log(soecon$SNAP))
hist(soecon$individual_homeless, main = 'Homeless individuals across states', xlab = 'per 10,000 people')
hist(log(soecon$individual_homeless))
hist(soecon$fam_homeless, main = 'Homeless in families across states', xlab = 'per 10,000 people')
hist(log(soecon$fam_homeless))
hist(soecon$total_homeless, main = 'Total homeless across states ', xlab = 'per 10,000 people')
hist(log(soecon$total_homeless))
hist(soecon$unemployment_rate, main='Unemployment rate across states', xlab = 'unemployment rate')
hist(soecon$Total_poverty, main='Total poverty across states', xlab = 'percentage of total population')
based on the histograms, most of the variables have skewed distribution, but after taking the logarithms of them, they tend to have more normal distribution. This is because most of the variables are population data, and the population values are big, but after taking the logarithms, the values will be more normally distributed.
pairs(soecon$total_homeless ~ soecon$SNAP + soecon$unemployment_rate + soecon$Total_poverty )
Based on the paired scatter plot, there is no obvious colinearity between the four main socioeconomic variables: SNAP enrollment, total_poverty rate, unemployment rate and total number of homeless people, which means that these four variables are independent with each other.
# for the homeless variable, test for colinearity between its three subvariables
pairs(soecon$total_homeless ~ soecon$individual_homeless + soecon$fam_homeless)
From the paired scatter plot, we can see that the total number of homeless people is correlated with number os homeless individuals and the number of homeless people in families, so these three variables are not independent with each other.
# for the homeless variable, test for colinearity between its three subvariables
pairs(soecon$Total_poverty ~ soecon$Male_poverty + soecon$Female_poverty)
By plotting the percentage of males and females that are in poverty in each state against each other, we can see that these variables are linearly correlated with each other and are not independent.
names(fulldat[, 1:25])
## [1] "...1" "YearStart"
## [3] "LocationAbbr" "LocationDesc"
## [5] "Asthmap" "LowConfidenceLimit"
## [7] "HighConfidenceLimit" "LocationID"
## [9] "TOB" "TOBLowConfidenceLimit"
## [11] "TOBHighConfidenceLimit" "AlcHeavyDataValue"
## [13] "AlcHeavyLowConfidenceLimit" "AlcHeavyHighConfidenceLimit"
## [15] "AlcBingeDataValue" "AlcBingeLowConfidenceLimit"
## [17] "AlcBingeHighConfidenceLimit" "ObeDataValue"
## [19] "ObeLowConfidenceLimit" "ObeHighConfidenceLimit"
## [21] "selfHlthDataValue" "selfHlthLowConfidenceLimit"
## [23] "selfHlthHighConfidenceLimit" "HlthCareDataValue"
## [25] "HlthCareLowConfidenceLimit"
summary(fulldat[, 4:25])
## LocationDesc Asthmap LowConfidenceLimit HighConfidenceLimit
## Length:468 Min. : 6.10 Min. : 4.200 Min. : 8.80
## Class :character 1st Qu.:10.70 1st Qu.: 8.500 1st Qu.:13.15
## Mode :character Median :12.10 Median : 9.800 Median :14.90
## Mean :12.31 Mean : 9.972 Mean :15.15
## 3rd Qu.:13.80 3rd Qu.:11.350 3rd Qu.:17.00
## Max. :22.20 Max. :18.000 Max. :27.00
## NA's :1 NA's :1 NA's :1
## LocationID TOB TOBLowConfidenceLimit TOBHighConfidenceLimit
## Min. : 1.00 Min. : 5.80 Min. : 4.80 Min. : 7.00
## 1st Qu.:16.75 1st Qu.:14.60 1st Qu.:11.95 1st Qu.:17.50
## Median :29.50 Median :18.50 Median :15.50 Median :21.80
## Mean :29.79 Mean :18.61 Mean :15.75 Mean :21.88
## 3rd Qu.:42.50 3rd Qu.:22.40 3rd Qu.:19.30 3rd Qu.:25.95
## Max. :72.00 Max. :38.30 Max. :34.50 Max. :42.20
## NA's :1 NA's :1 NA's :1
## AlcHeavyDataValue AlcHeavyLowConfidenceLimit AlcHeavyHighConfidenceLimit
## Min. : 2.500 Min. : 1.500 Min. : 3.300
## 1st Qu.: 5.200 1st Qu.: 3.700 1st Qu.: 7.200
## Median : 6.300 Median : 4.700 Median : 8.300
## Mean : 6.383 Mean : 4.722 Mean : 8.619
## 3rd Qu.: 7.300 3rd Qu.: 5.500 3rd Qu.: 9.700
## Max. :14.300 Max. :11.400 Max. :19.500
## NA's :3 NA's :3 NA's :3
## AlcBingeDataValue AlcBingeLowConfidenceLimit AlcBingeHighConfidenceLimit
## Min. : 8.40 Min. : 5.40 Min. :10.70
## 1st Qu.:15.20 1st Qu.:12.55 1st Qu.:18.20
## Median :17.90 Median :15.20 Median :21.30
## Mean :18.12 Mean :15.26 Mean :21.41
## 3rd Qu.:20.80 3rd Qu.:17.70 3rd Qu.:24.30
## Max. :32.80 Max. :27.30 Max. :39.70
## NA's :1 NA's :1 NA's :1
## ObeDataValue ObeLowConfidenceLimit ObeHighConfidenceLimit selfHlthDataValue
## Min. :39.40 Min. :34.60 Min. :43.00 Min. :78.50
## 1st Qu.:49.50 1st Qu.:45.20 1st Qu.:53.25 1st Qu.:85.85
## Median :53.40 Median :49.20 Median :57.60 Median :87.80
## Mean :53.41 Mean :49.28 Mean :57.48 Mean :87.57
## 3rd Qu.:57.70 3rd Qu.:53.60 3rd Qu.:61.55 3rd Qu.:89.55
## Max. :69.40 Max. :64.70 Max. :74.10 Max. :95.30
## NA's :1 NA's :1 NA's :1 NA's :1
## selfHlthLowConfidenceLimit selfHlthHighConfidenceLimit HlthCareDataValue
## Min. :72.70 Min. :81.5 Min. :59.60
## 1st Qu.:82.90 1st Qu.:88.3 1st Qu.:79.30
## Median :84.90 Median :90.0 Median :84.50
## Mean :84.73 Mean :89.9 Mean :83.61
## 3rd Qu.:87.20 3rd Qu.:91.6 3rd Qu.:88.90
## Max. :92.90 Max. :97.0 Max. :96.50
## NA's :1 NA's :1 NA's :1
## HlthCareLowConfidenceLimit
## Min. :56.40
## 1st Qu.:75.75
## Median :81.50
## Mean :80.47
## 3rd Qu.:85.90
## Max. :94.80
## NA's :1
hist(fulldat$Asthmap, main='Histogram of asthma crude prevalence
among women aged 18-44 years across states' )
hist(fulldat$TOB, main='Histogram of Percent current cigarette smoking
among women aged 18-44 years across states')
hist(fulldat$selfHlthDataValue, main = 'Histogram of Self-rated health status
among women aged 18-44 years across states')
Based on these histograms, we can see that most of the covariates here
are normally distributed with some skewness since they’re data from the
realworld.
# paired scatter plots of Asthma Prevalence and CDI variables
pairs(Asthmap ~ TOB + selfHlthDataValue + HlthCareDataValue, dat = fulldat)
Based on these paired comparisons, we can see that the asthma prevalence
has a relatively strong correlation with health care coverage data
compared to the other two covariates. It will likely be a confounder
that needs to be addressed.
#air pollutant exploratory
par(mfrow = c(3,2))
plot(fulldat$Asthmap ~ fulldat$o3_mean, ylab = "Asthma Pervalence", xlab = "Ozone")
plot(fulldat$Asthmap ~ fulldat$pm2.5_mean,ylab = "Asthma Pervalence", xlab = "PM2.5")
plot(fulldat$Asthmap ~ fulldat$so2_mean,ylab = "Asthma Pervalence", xlab = "Sulfur Dioxide")
plot(fulldat$Asthmap ~ fulldat$co_mean, ylab = "Asthma Pervalence", xlab = "Carbon Monoxide")
plot(fulldat$Asthmap ~ fulldat$no2_mean, ylab = "Asthma Pervalence", xlab = "Nitrogen Dioxide")
pairs(o3_mean ~ pm2.5_mean + so2_mean + co_mean + no2_mean, data = fulldat)
# paired scatter plots of Asthma Pervalence and socioeconomic variables
pairs(Asthmap ~ total_homeless + SNAP + unemployment_rate + Total_poverty, dat = fulldat)
From the paired scatterplot, the pervalence of Asthma is independent of the four main variables in the socioeconomic category.
par(mfrow = c(2,2))
scatter.smooth(fulldat$TOB, fulldat$Asthmap)
scatter.smooth(fulldat$selfHlthDataValue, fulldat$Asthmap)
scatter.smooth(fulldat$HlthCareDataValue, fulldat$Asthmap)
scatter.smooth(fulldat$SNAP, fulldat$Asthmap)
par(mfrow = c(2,2))
scatter.smooth(fulldat$individual_homeless, fulldat$Asthmap)
scatter.smooth(fulldat$fam_homeless, fulldat$Asthmap)
scatter.smooth(fulldat$total_homeless, fulldat$Asthmap)
scatter.smooth(fulldat$unemployment_rate, fulldat$Asthmap)
par(mfrow = c(2,2))
scatter.smooth(fulldat$Female_poverty, fulldat$Asthmap)
scatter.smooth(fulldat$Total_poverty, fulldat$Asthmap)
scatter.smooth(fulldat$o3_mean, fulldat$Asthmap)
scatter.smooth(fulldat$pm2.5_mean, fulldat$Asthmap)
par(mfrow = c(2,2))
scatter.smooth(fulldat$so2_mean, fulldat$Asthmap)
scatter.smooth(fulldat$co_mean, fulldat$Asthmap)
scatter.smooth(fulldat$no2_mean, fulldat$Asthmap)
We can see from the scatter plots that some crude values are heavily
clustered (especially socioeconomic data). This coincides with our
findings above in the socioeconomic section where the histograms are
heavily skewed. We may consider to take log of these data in our further
analyses.
# now we try to determine which variable is more significant in predicting the Asthma prevalence
# forward selection
coln <- colnames(fulldat)
coln
## [1] "...1" "YearStart"
## [3] "LocationAbbr" "LocationDesc"
## [5] "Asthmap" "LowConfidenceLimit"
## [7] "HighConfidenceLimit" "LocationID"
## [9] "TOB" "TOBLowConfidenceLimit"
## [11] "TOBHighConfidenceLimit" "AlcHeavyDataValue"
## [13] "AlcHeavyLowConfidenceLimit" "AlcHeavyHighConfidenceLimit"
## [15] "AlcBingeDataValue" "AlcBingeLowConfidenceLimit"
## [17] "AlcBingeHighConfidenceLimit" "ObeDataValue"
## [19] "ObeLowConfidenceLimit" "ObeHighConfidenceLimit"
## [21] "selfHlthDataValue" "selfHlthLowConfidenceLimit"
## [23] "selfHlthHighConfidenceLimit" "HlthCareDataValue"
## [25] "HlthCareLowConfidenceLimit" "HlthCareHighConfidenceLimit"
## [27] "X.x" "SNAP"
## [29] "individual_homeless" "fam_homeless"
## [31] "total_homeless" "unemployment_rate"
## [33] "Male_poverty" "Female_poverty"
## [35] "Total_poverty" "X.y"
## [37] "o3_mean" "pm2.5_mean"
## [39] "so2_mean" "co_mean"
## [41] "no2_mean" "State.Name"
fulldat <- fulldat |> drop_na()
require(broom)
#forward selection procedure using AIC values
lm1 <- lm(Asthmap ~ 1, data=fulldat)
stepModel <- step(lm1, direction="forward",
scope=(~ selfHlthDataValue + HlthCareDataValue + SNAP+ individual_homeless+fam_homeless+total_homeless+unemployment_rate+Male_poverty+Female_poverty+Total_poverty+o3_mean+pm2.5_mean+so2_mean+co_mean+no2_mean), data=fulldat)
## Start: AIC=730.7
## Asthmap ~ 1
##
## Df Sum of Sq RSS AIC
## + HlthCareDataValue 1 517.58 1843.0 627.51
## + SNAP 1 234.38 2126.2 688.26
## + Female_poverty 1 168.16 2192.5 701.30
## + Total_poverty 1 156.58 2204.1 703.54
## + individual_homeless 1 141.75 2218.9 706.39
## + Male_poverty 1 132.16 2228.5 708.22
## + total_homeless 1 109.11 2251.5 712.59
## + pm2.5_mean 1 48.19 2312.4 723.94
## + selfHlthDataValue 1 44.27 2316.4 724.66
## + fam_homeless 1 25.34 2335.3 728.12
## + unemployment_rate 1 25.17 2335.5 728.15
## + o3_mean 1 22.64 2338.0 728.61
## + so2_mean 1 18.38 2342.3 729.38
## + co_mean 1 17.73 2342.9 729.50
## + no2_mean 1 12.69 2347.9 730.41
## <none> 2360.6 730.70
##
## Step: AIC=627.51
## Asthmap ~ HlthCareDataValue
##
## Df Sum of Sq RSS AIC
## + individual_homeless 1 89.073 1754.0 608.46
## + total_homeless 1 74.204 1768.8 612.05
## + SNAP 1 73.805 1769.2 612.14
## + unemployment_rate 1 27.231 1815.8 623.19
## + fam_homeless 1 24.571 1818.5 623.81
## + so2_mean 1 9.664 1833.4 627.28
## <none> 1843.0 627.51
## + no2_mean 1 8.485 1834.6 627.55
## + Female_poverty 1 7.001 1836.0 627.89
## + Total_poverty 1 5.343 1837.7 628.28
## + selfHlthDataValue 1 4.820 1838.2 628.40
## + Male_poverty 1 3.225 1839.8 628.77
## + co_mean 1 1.851 1841.2 629.09
## + pm2.5_mean 1 0.235 1842.8 629.46
## + o3_mean 1 0.067 1843.0 629.50
##
## Step: AIC=608.46
## Asthmap ~ HlthCareDataValue + individual_homeless
##
## Df Sum of Sq RSS AIC
## + unemployment_rate 1 41.986 1712.0 600.16
## <none> 1754.0 608.46
## + selfHlthDataValue 1 7.988 1746.0 608.52
## + Female_poverty 1 7.629 1746.3 608.61
## + so2_mean 1 5.785 1748.2 609.06
## + Total_poverty 1 5.276 1748.7 609.18
## + SNAP 1 4.842 1749.1 609.28
## + pm2.5_mean 1 3.445 1750.5 609.62
## + fam_homeless 1 3.108 1750.9 609.71
## + total_homeless 1 3.108 1750.9 609.71
## + Male_poverty 1 2.576 1751.4 609.83
## + no2_mean 1 0.738 1753.2 610.28
## + co_mean 1 0.552 1753.4 610.33
## + o3_mean 1 0.031 1754.0 610.45
##
## Step: AIC=600.16
## Asthmap ~ HlthCareDataValue + individual_homeless + unemployment_rate
##
## Df Sum of Sq RSS AIC
## + Female_poverty 1 23.6828 1688.3 596.24
## + Total_poverty 1 21.3343 1690.7 596.83
## + Male_poverty 1 16.2212 1695.8 598.12
## + selfHlthDataValue 1 13.7980 1698.2 598.72
## + no2_mean 1 8.0745 1703.9 600.15
## <none> 1712.0 600.16
## + co_mean 1 6.0010 1706.0 600.67
## + SNAP 1 5.6554 1706.3 600.76
## + so2_mean 1 3.3985 1708.6 601.32
## + fam_homeless 1 1.0034 1711.0 601.91
## + total_homeless 1 1.0034 1711.0 601.91
## + pm2.5_mean 1 0.2104 1711.8 602.11
## + o3_mean 1 0.0011 1712.0 602.16
##
## Step: AIC=596.24
## Asthmap ~ HlthCareDataValue + individual_homeless + unemployment_rate +
## Female_poverty
##
## Df Sum of Sq RSS AIC
## + selfHlthDataValue 1 40.696 1647.6 587.87
## <none> 1688.3 596.24
## + no2_mean 1 7.661 1680.7 596.31
## + co_mean 1 6.069 1682.2 596.71
## + SNAP 1 3.006 1685.3 597.48
## + so2_mean 1 1.563 1686.8 597.85
## + pm2.5_mean 1 1.072 1687.2 597.97
## + Male_poverty 1 0.909 1687.4 598.01
## + Total_poverty 1 0.834 1687.5 598.03
## + fam_homeless 1 0.705 1687.6 598.06
## + total_homeless 1 0.705 1687.6 598.06
## + o3_mean 1 0.344 1688.0 598.16
##
## Step: AIC=587.87
## Asthmap ~ HlthCareDataValue + individual_homeless + unemployment_rate +
## Female_poverty + selfHlthDataValue
##
## Df Sum of Sq RSS AIC
## + no2_mean 1 23.7578 1623.8 583.70
## + co_mean 1 10.8609 1636.8 587.06
## <none> 1647.6 587.87
## + SNAP 1 5.9157 1641.7 588.34
## + so2_mean 1 1.9988 1645.6 589.36
## + Male_poverty 1 1.9390 1645.7 589.37
## + Total_poverty 1 1.6508 1646.0 589.45
## + fam_homeless 1 1.5058 1646.1 589.48
## + total_homeless 1 1.5058 1646.1 589.48
## + pm2.5_mean 1 0.6644 1647.0 589.70
## + o3_mean 1 0.3250 1647.3 589.79
##
## Step: AIC=583.7
## Asthmap ~ HlthCareDataValue + individual_homeless + unemployment_rate +
## Female_poverty + selfHlthDataValue + no2_mean
##
## Df Sum of Sq RSS AIC
## <none> 1623.8 583.70
## + pm2.5_mean 1 6.9724 1616.9 583.87
## + SNAP 1 3.9647 1619.9 584.66
## + Male_poverty 1 2.1987 1621.7 585.12
## + Total_poverty 1 2.1634 1621.7 585.13
## + so2_mean 1 1.4531 1622.4 585.32
## + co_mean 1 1.0612 1622.8 585.42
## + fam_homeless 1 0.9777 1622.9 585.44
## + total_homeless 1 0.9777 1622.9 585.44
## + o3_mean 1 0.1881 1623.7 585.65
From the results of forward selection, we can roughly have an idea on which covariates may have a stronger effect as a confounder. We can see that the variables that we may need to include in our final model are: health care coverage data, homeless percentage, unemployment rate, poverty rate, self-rated health status and \(NO_2\) level.